# Load packages
library(here)
library(dplyr)
library(readr)
library(rstan)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(rstanarm)
library(janitor)
library(tidybayes)
library(broom.mixed)
library(here)
library(sf)
library(tidycensus)
library(openxlsx)
library(s2)
library(nycgeo)
library(CARBayes)
library(spData) 
library(spdep)

nyc_join <- merge(nta_sf,nta_acs_data)


nyc_join <- nyc_join %>%
  st_transform(., 4269)

county_list <- nyc_join %>% pull(county_name) %>% unique()


census_api_key("0cc07f06386e317f312adef5e0892b0d002b7254")

census_data <- get_acs(state = "NY", 
        county = c(county_list), 
        geography = "tract", 
        variables = c(gini_inequality ="B19083_001"),
        year = 2019,
        output = "wide",
        survey = "acs5",
        geometry = TRUE) %>% 
  dplyr::select(-c(NAME, ends_with("M"))) %>%
         rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2)))  %>%
  st_transform(., 4269) %>%
  dplyr::select(-GEOID)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |======================================================================| 100%
# themes
theme_set(theme_minimal())
vari_names <- read_csv(here("clean_data", "nyc_names.csv"))
nyc_clean <- st_read(here("clean_data", "nyc_data.shp"), crs = 4269, quiet=T) 
colnames(nyc_clean) <- colnames(vari_names)


library(openxlsx)
nta_to_census <- openxlsx::read.xlsx(here("ethnic", "Data", "census_to_nta.xlsx")) %>%
  dplyr::select(BoroName, NTACode) %>%
  rename(borough = BoroName,
         nta_id = NTACode) %>%
  unique()


nyc_clean <- nyc_clean %>%
  merge(., nta_to_census, by="nta_id") %>%
  mutate(transportation_desert_4cat = case_when(
    transportation_desert_4cat==1 ~ "Poor",
    transportation_desert_4cat ==2 ~ "Limited",
    transportation_desert_4cat ==3 ~ "Satisfactory",
    TRUE ~ "Excellent",
  ))  %>%
  mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent")))


assault <- st_read("/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp") %>%
    filter(str_detect(ofns_desc, "FELONY ASSAULT"))  %>%
    filter(!(str_detect(ofns_desc, "POLICE")))  %>%
    filter(str_detect(pd_desc, "2") | str_detect(pd_desc, "1") ) %>%
  dplyr::select(arrest_key,geometry) %>%
  st_transform(., 4269)
## Reading layer `geo_export_a659754f-6263-4ba1-8a58-72dca5befa79' from data source 
##   `/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 115299 features and 19 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.25184 ymin: 40.4994 xmax: -73.703 ymax: 40.91272
## Geodetic CRS:  WGS84(DD)
sex <- st_read("/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp") %>%
    filter(str_detect(ofns_desc, "SEX"))  %>%
  dplyr::select(arrest_key,geometry) %>%
  st_transform(., 4269)
## Reading layer `geo_export_a659754f-6263-4ba1-8a58-72dca5befa79' from data source 
##   `/Users/freddy/Downloads/NYPD Arrest Data (Year to Date)/geo_export_a659754f-6263-4ba1-8a58-72dca5befa79.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 115299 features and 19 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.25184 ymin: 40.4994 xmax: -73.703 ymax: 40.91272
## Geodetic CRS:  WGS84(DD)
assault_neighborhood <- st_join(nyc_clean, assault, left = TRUE) %>%
  group_by(nta_id) %>%
  summarize(assault_count=n())  %>%
  as.tibble()


sex_neighborhood <- st_join(nyc_clean, sex,   left = TRUE) %>%
  group_by(nta_id) %>%
  summarize(sexcrime_count=n())  %>%
  as.tibble()


gini_neighborhood <- st_join(nyc_clean, census_data,left = TRUE) %>%
  group_by(nta_id) %>%
  summarize(gini_neighborhood=mean(gini_inequality, na.rm=T)) %>%
  as.tibble() %>%
  dplyr::select(nta_id, gini_neighborhood) 

assault_gini <- left_join(assault_neighborhood, gini_neighborhood,by="nta_id") %>%
  mutate(gini = gini_neighborhood, .before=3)%>%
  dplyr::select(-gini_neighborhood) %>%
  dplyr::select(-geometry)

sex_assault_gini <- left_join(assault_gini, sex_neighborhood,by="nta_id") %>%
  mutate(sex_crime_count = sexcrime_count, .before=3)%>%
  dplyr::select(-sexcrime_count) %>%
  dplyr::select(-geometry)


nyc_clean <- nyc_clean %>%
  as.tibble() %>%
  filter(nta_id %in% sex_assault_gini$nta_id) %>%
  left_join(., sex_assault_gini, by="nta_id")%>% 
  unique() %>%
  st_as_sf()
subway_stations <- st_read(here("ethnic","Data","stations", "geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5.shp"), quiet=T)  %>%
  st_transform(., 4269)

bus_stations <- st_read(here("ethnic","Data","bus", "bus_stops_nyc_may2020.shp"), quiet=T)  %>%
  st_transform(., 4269)%>%
  filter(str_detect(NAMELSAD, "Richmond", negate=T))

transit_points <- read_csv(here("transit","ridership_points.csv"))%>%
  separate(Position, into=c("Point", "longitude", "latitude"), " ") %>%
  mutate(latitude = str_remove_all(latitude, "[)]"),
         longitude = str_remove_all(longitude, "[()]"),
         ) %>%
  dplyr::select(-c(Point)) %>% 
  mutate(latitude = as.numeric(latitude),
         longitude = as.numeric(longitude)) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4269)
#plot locations over map
subway_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = subway_stations, color="#3F123C", size=1) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = bus_stations, color="#3F123C", size=.5, alpha=.5) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = sub_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Subway Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = bus_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Bus Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


ridership <- nyc_clean%>%
  ggplot() +
  geom_sf(aes(fill = log2(mean_ridership)), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Log2 Mean Ridership") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean (Log2) Subway Turnstile \nRidership in 2018 \nfor NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

access <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = transportation_desert_4cat), color = "#8f98aa") +
  scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility Category"), na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Deserts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

`

red <- ggplot(nyc_clean) +
  geom_sf(aes(fill = below_poverty_line_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#E13728", guide = guide_legend(title = "Number Below \nPoverty Line")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Impoverishement")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

yellow <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_income), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#F3D24E", guide = guide_legend(title = "Mean Income")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Income")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

teal <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_rent), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#2DBDC7", guide = guide_legend(title = "Dollars")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Rent")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

purple <- ggplot(nyc_clean) +
  geom_sf(aes(fill = eviction_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#7826C0", guide = guide_legend(title = "Number of Evictions")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Evictions")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

orange <- ggplot(nyc_clean) +
  geom_sf(aes(fill = unemployment_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#FC9228", guide = guide_legend(title = "Number on \nUnemployment")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Unemployment")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

green <- ggplot(nyc_clean) +
  geom_sf(aes(fill = store_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#326902", guide = guide_legend(title = "Number of Stores")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Retail Food Stores")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

blue <- ggplot(nyc_clean) +
  geom_sf(aes(fill = school_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#5372C4", 
                      guide = guide_legend(title = "Number of Schools")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Number of Schools")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

pink <- ggplot(nyc_clean) +
  geom_sf(aes(fill = uninsured_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#F450E1", guide = guide_legend(title = "Number of People \n without Insurance Coverage")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Insurance Coverage")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

navy <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = gini), color = "#8f98aa")+
  scale_fill_gradient(low = "#F8E3DD", high = "#16236f",
                      guide = guide_legend(title = "Gini Inequality Values"))+
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Income Inequality")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
white <- ggplot(nyc_clean) +
  geom_sf(aes(fill = white_count), color = "#8f98aa") +
  scale_fill_gradientn(colors = c("#FCF5EE","#BD9DA5", "#9C7080", "#7B435B"),  guide = guide_legend(title = "Number White")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("White Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

black <- ggplot(nyc_clean) +
  geom_sf(aes(fill = black_count), color = "#8f98aa") +
  scale_fill_gradientn(colors = c("#FCF5EE","#F8ABA6", "#F58581", "#F25F5C"), guide = guide_legend(title = "Number Black")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Black Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

asian <- ggplot(nyc_clean) +
  geom_sf(aes(fill = asian_count), color = "#8f98aa") +
  scale_fill_gradientn(colors = c("#FCF5EE","#B8BAD9", "#959CCE", "#717EC3"),                      guide = guide_legend(title = "Number Asian")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Asian Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

latinx <- ggplot(nyc_clean) +
  geom_sf(aes(fill = latinx_count), color = "#8f98aa")+
  scale_fill_gradientn(colors = c("#FCF5EE","#FDC894", "#FDB166", "#FC9A38"), 
                      guide = guide_legend(title = "Number Latinx")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Latinx Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

pop <- ggplot(nyc_clean) +
  geom_sf(aes(fill = total_pop), color = "#8f98aa")+
  scale_fill_gradientn(colors = c("#FCF5EE","#C0E3C3", "#A1D9AD", "#81CF97"), guide = guide_legend(title = "Number of People")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Total Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Data Introduction

All the data used in this project are from two major sources: the Tidycensus package and NYC Open Data.

Tidycensus is an R package interface, developed by Kyle Walker and Matt Herman, that enables easy access to the US Census Bureau’s data APIs and returns Tidyverse-ready data frames from various major US Census Bureau datasets. Our demographic and socioeconomic data are drawn from the American Community Survey results found in Tidycensus package. A summary of our ACS data variables is below:

  • borough: Each Neighborhood’s Borough.
  • total_pop: Total Population by Neighborhood
  • mean_income: Mean Income by Neighborhood
  • below_poverty_line_count: Number of People Living Below the 100% Poverty Line by Neighborhood
  • mean_rent: Mean Rent by Neighborhood
  • unemployment_count: Number of People on Unemployment by Neighborhood
  • latinx_count: Number of Latinx People by Neighborhood
  • white_count: Number of White People by Neighborhood
  • black_count: Number of Black People by Neighborhood
  • native_count: Number of Native People by Neighborhood
  • asian_count: Number of Asian People by Neighborhood
  • naturalized_citizen_count: Number of Naturalized Citizens by Neighborhood
  • noncitizen_count: Number of Foreign Born People by Neighborhood
  • uninsured_count: Number of Uninsured Citizens of any Age by Neighborhood

For remaining predictors, we used NYC Open Data’s portal to identify specific predictors. In particular, we used geotagged locations of Subway Stops, Bus Stops, Grocery Stores, Schools, and Eviction Sites from the Departments of Transportation, Health, Education, and Housing to calculate neighborhood-specific variables described below:

  • school_count: Number of Public Schools by Neighborhood
  • eviction_count: Number of Evictions by Neighborhood
  • store_count: Number of Grocery Stores and Food Vendors by Neighborhood
  • sub_count: Number of Subway Stations by Neighborhood
  • bus_count: Number of Bus Stations by Neighborhood
  • perc_covered_by_transit: Percent of Neighborhood Within Walking Distance (.25 miles) of Any Subway Stop.
  • transportation_desert_4cat: Subway Accessibility by Neighborhood (None, Limited, Satisfactory, Excellent)

Lastly, we acquired subway ridership from Metropolitan Transportation Authority’s turnstile data for the week of September 7, 2019. For each station, entry/exit of each turnstile is recorded. Then, we aggregated this information by taking the station-specific average of subway ridership across the 7 days in the week. Finally, we geotagged each listed station, then took the mean of ridership at all subway stations in each neighborhood to create.

  • mean_ridership: Mean Subway Ridership by Neighborhood for the week of September 7th.







Data Summaries

Our data has 224 observations of 26 variables. Below is a preview of our dataset with colnames attached.

library(kableExtra)
kable(head(nyc_clean, n=3)) %>% kable_styling() %>% scroll_box(width = "100%", height = "200px")
nta_id total_pop mean_income below_poverty_line_count below_poverty_line_and_50_count mean_rent unemployment_count latinx_count white_count black_count native_count asian_count naturalized_citizen_count noncitizen_count uninsured_count school_count eviction_count store_count sub_count bus_count mean_ridership perc_covered_by_transit transportation_desert_4cat borough geometry assault_count sex_crime_count gini
BK0101 26308 98338.67 2397 1289 2062.667 582 3284 20526 482 40 1052 3777 3129 1797 6 68 71 2 53 9410.500 78.76390 Satisfactory Brooklyn MULTIPOLYGON (((-73.94074 4… 22 7 0.4420550
BK0102 57774 101238.92 9120 4474 2330.077 1710 18227 32237 1460 0 4008 6802 7746 3725 12 204 129 2 97 26603.000 89.80852 Satisfactory Brooklyn MULTIPOLYGON (((-73.96355 4… 14 14 0.4859323
BK0103 36891 30309.25 18285 5970 1194.875 457 3351 31799 1288 20 194 3548 1012 711 6 45 58 3 35 6348.667 88.13439 Satisfactory Brooklyn MULTIPOLYGON (((-73.96762 4… 7 1 0.4937143



Below is a numeric summary of each variable’s distribution.

#summary(nyc_clean)
library(table1)
table_print <- table1(~ total_pop + mean_income + below_poverty_line_count+ 
         mean_rent + unemployment_count + white_count + uninsured_count + school_count + eviction_count + store_count + transportation_desert_4cat+ sub_count + bus_count + mean_ridership | borough, data = nyc_clean %>% as_tibble())  %>% as_tibble() 

colnames(table_print) <- c("Variable", "Bronx (N=44)", "Brooklyn (N=64)","Manhattan (N=39)", "Queens (N=77)", "Overall (N=224)")

table_print%>%
  filter(Variable!="") %>% kable() %>% kable_styling() %>% scroll_box(width = "100%", height = "500px")
Variable Bronx (N=44) Brooklyn (N=64) Manhattan (N=39) Queens (N=77) Overall (N=224)
total_pop
  Mean (SD) 30200 (18700) 37800 (24600) 38300 (24600) 25900 (20900) 32300 (22800)
  Median [Min, Max] 29800 [0, 69200] 38300 [0, 97800] 35700 [0, 95300] 25000 [0, 87700] 31600 [0, 97800]
mean_income
  Mean (SD) 43400 (17900) 69600 (28700) 103000 (49700) 74500 (14400) 71900 (33900)
  Median [Min, Max] 38000 [23100, 94200] 61200 [27400, 148000] 108000 [33300, 212000] 72600 [37500, 104000] 67200 [23100, 212000]
  Missing 8 (18.2%) 9 (14.1%) 6 (15.4%) 19 (24.7%) 42 (18.8%)
below_poverty_line_count
  Mean (SD) 8360 (6490) 7430 (6120) 6180 (6100) 3090 (2900) 5900 (5700)
  Median [Min, Max] 7260 [0, 21600] 6880 [0, 28800] 3290 [0, 22800] 2680 [0, 11600] 4220 [0, 28800]
mean_rent
  Mean (SD) 1230 (197) 1580 (465) 1960 (674) 1650 (198) 1600 (465)
  Median [Min, Max] 1240 [833, 1620] 1450 [792, 3280] 2070 [884, 3270] 1630 [1140, 2250] 1510 [792, 3280]
  Missing 8 (18.2%) 9 (14.1%) 6 (15.4%) 19 (24.7%) 42 (18.8%)
unemployment_count
  Mean (SD) 1400 (972) 1180 (903) 1210 (1100) 756 (685) 1080 (916)
  Median [Min, Max] 1330 [0, 3150] 1120 [0, 3770] 952 [0, 4700] 668 [0, 3150] 919 [0, 4700]
white_count
  Mean (SD) 2830 (4990) 13900 (14200) 17200 (14700) 6740 (8500) 9850 (12300)
  Median [Min, Max] 919 [0, 27500] 10900 [0, 64500] 13800 [0, 69300] 4200 [0, 43900] 4960 [0, 69300]
uninsured_count
  Mean (SD) 2570 (1990) 2750 (2260) 2030 (2150) 2350 (2510) 2450 (2280)
  Median [Min, Max] 2340 [0, 8030] 2610 [0, 10100] 1380 [0, 10300] 1660 [0, 12200] 1910 [0, 12200]
school_count
  Mean (SD) 8.64 (7.44) 8.16 (6.79) 8.54 (7.79) 4.08 (2.88) 6.92 (6.41)
  Median [Min, Max] 5.50 [1.00, 27.0] 6.00 [1.00, 31.0] 5.00 [1.00, 28.0] 3.00 [1.00, 12.0] 5.00 [1.00, 31.0]
eviction_count
  Mean (SD) 438 (340) 245 (234) 223 (233) 126 (131) 238 (255)
  Median [Min, Max] 406 [1.00, 1130] 163 [1.00, 829] 152 [1.00, 1120] 93.0 [1.00, 521] 148 [1.00, 1130]
store_count
  Mean (SD) 53.2 (38.6) 69.8 (49.5) 58.6 (39.8) 33.1 (33.8) 52.0 (43.1)
  Median [Min, Max] 48.5 [1.00, 138] 70.5 [1.00, 202] 54.0 [1.00, 151] 24.0 [1.00, 147] 45.0 [1.00, 202]
transportation_desert_4cat
  Poor 4 (9.1%) 8 (12.5%) 2 (5.1%) 40 (51.9%) 54 (24.1%)
  Limited 16 (36.4%) 16 (25.0%) 2 (5.1%) 22 (28.6%) 56 (25.0%)
  Satisfactory 11 (25.0%) 18 (28.1%) 11 (28.2%) 12 (15.6%) 52 (23.2%)
  Excellent 13 (29.5%) 22 (34.4%) 24 (61.5%) 3 (3.9%) 62 (27.7%)
sub_count
  Mean (SD) 1.95 (1.33) 2.77 (2.06) 4.03 (3.53) 1.55 (1.22) 2.41 (2.23)
  Median [Min, Max] 1.00 [1.00, 7.00] 2.00 [1.00, 9.00] 3.00 [1.00, 17.0] 1.00 [1.00, 6.00] 1.00 [1.00, 17.0]
bus_count
  Mean (SD) 40.1 (22.9) 60.2 (41.1) 46.6 (26.0) 56.9 (45.0) 52.7 (38.0)
  Median [Min, Max] 43.5 [1.00, 125] 59.0 [1.00, 170] 44.0 [2.00, 106] 52.0 [1.00, 243] 49.0 [1.00, 243]
mean_ridership
  Mean (SD) 7180 (3410) 7400 (4690) 22900 (19500) 10900 (12200) 12000 (13300)
  Median [Min, Max] 6670 [2420, 15700] 6610 [1040, 26600] 18000 [5640, 110000] 7920 [273, 55700] 7960 [273, 110000]
  Missing 21 (47.7%) 18 (28.1%) 7 (17.9%) 54 (70.1%) 100 (44.6%)







Data Visuals

Non-Spatial

library(ggridges)
plot_1<-nyc_clean %>% 
  ggplot(aes(x=mean_income, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Income", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_2<-nyc_clean %>% 
  ggplot(aes(x=below_poverty_line_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Number Below Poverty Line", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_3<-nyc_clean %>% 
  ggplot(aes(x=mean_rent, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Rent", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_4<-nyc_clean %>% 
  ggplot(aes(x=unemployment_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Unemployed Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_5<-nyc_clean %>% 
  ggplot(aes(x=white_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="White Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_6<-nyc_clean %>% 
  ggplot(aes(x=uninsured_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Uninsured Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))


plot_7<-nyc_clean %>% 
  ggplot(aes(x=school_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="School Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_8<-nyc_clean %>% 
  ggplot(aes(x=eviction_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Eviction Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_9<-nyc_clean %>% 
  ggplot(aes(x=store_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Food Retail Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_10<-nyc_clean %>% 
  ggplot(aes(x=borough, fill=transportation_desert_4cat), alpha=.6) +
  geom_bar(position="fill") +
  scale_y_continuous(labels = seq(0, 100, by = 25)) +
  labs(title="Subway Accessibility", y="", x="")+
    theme(panel.grid.major = element_line("transparent"),
         # axis.text.y.left = element_blank(),
          axis.text.x.bottom = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold")) +
   scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") 

plot_11<-nyc_clean %>% 
  ggplot(aes(x=sub_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Subway Stop Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))


plot_12<-nyc_clean %>% 
  ggplot(aes(x=bus_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Bus Stop Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_13<-nyc_clean %>% 
  ggplot(aes(x=mean_ridership, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Ridership", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
library(egg)
ggarrange(plot_1, plot_2, plot_3, 
          plot_4, plot_5, plot_6, 
          plot_7, plot_8, plot_9, plot_11, plot_12, 
          plot_13, plot_10,
          ncol=4)

Spatial

Ridership

library(egg)
ggarrange(subway_loc, bus_loc, stops, bus_stops, ridership, access, ncol=3)



Structural

ggarrange(red, orange, yellow, green, teal, blue, navy, purple, pink, ncol=3)



Demographic

ggarrange(pop, white, black, latinx, asian, ncol=3)

Crime

nyc_clean %>%
ggplot() +
  geom_sf(aes(fill = sex_crime_count), color = "#8f98aa")+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide = guide_legend(title = "Sex-Based Crime Counts")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Sex-Based Violence")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

nyc_clean %>%
ggplot() +
  geom_sf(aes(fill = assault_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#F8E3DD", high = "#95174F", guide = guide_legend(title = "Felony Assault Counts")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("1st and 2nd Degree Felony \nAssault")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))







Demographics Models

Models 1 & 2: Non-Hierarchical

We use poisson and negative binomial regression to model observed white counts, \(i\), using our \(k=10\) predictors:

  • mean_income
  • mean_rent
  • unemployment_count
  • sub_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]

poisson_non_hierarchical <- stan_glm(
  white_count ~ mean_income + mean_rent + 
    unemployment_count + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>% 
  kable_styling()
Non-Hierarchical Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 3419.8903715 0.0038365 3404.9150441 3438.9527477
mean_income 0.0010119 0.0000000 0.0010055 0.0010168
mean_rent -0.0304929 0.0000036 -0.0309057 -0.0300877
unemployment_count 0.0379527 0.0000018 0.0377482 0.0381685
transportation_desert_4catLimited 71.5709164 0.0027320 71.0167804 72.1022773
transportation_desert_4catSatisfactory 79.4589797 0.0028885 78.8097506 80.0479727
transportation_desert_4catExcellent 93.6889140 0.0028636 92.9789485 94.3611356
school_count 0.1367263 0.0001385 0.1189483 0.1548387
store_count 1.0357874 0.0000282 1.0321909 1.0392870
bus_count 0.4829094 0.0000224 0.4800717 0.4854354
eviction_count -0.3405727 0.0000054 -0.3412664 -0.3398966
uninsured_count -0.0082767 0.0000005 -0.0083515 -0.0082204

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]

negbin_non_hierarchical <- stan_glm(
  white_count ~  mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) + 
  xlab("White Resident Count") +
  
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_non_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1903.3130070 0.4882711 1053.9749617 3690.9242147
mean_income 0.0014141 0.0000069 0.0007642 0.0022762
unemployment_count 0.0439401 0.0001686 0.0238975 0.0644908
transportation_desert_4catLimited 90.2330125 0.2297732 36.6221884 160.7311834
transportation_desert_4catSatisfactory 91.4182213 0.3000295 31.9686484 178.8491978
transportation_desert_4catExcellent 123.4284965 0.2783874 56.9296869 211.6754022
store_count 0.8923541 0.0032509 0.4765713 1.3486077
bus_count 0.7071825 0.0033973 0.1927714 1.1140631
eviction_count -0.3279124 0.0004594 -0.3830488 -0.2664993
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Negative Binomial")

Negative binomial has much better error metrics.

Models 2 & 3: Hierarchy by Borough

We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with transportation_desert_4cat:

  • mean_income
  • mean_rent
  • unemployment_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
 white_count ~ mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 3125.2861065 0.5888946 0.7030650 4682.1874542
mean_income 0.0007656 0.0000001 0.0007519 0.0007729
mean_rent -0.0294444 0.0000041 -0.0299564 -0.0287658
unemployment_count 0.0354221 0.0000018 0.0351726 0.0356746
transportation_desert_4catLimited 71.3728705 0.0037461 69.2576328 72.2937517
transportation_desert_4catSatisfactory 60.1612356 0.0037984 57.7672623 61.1822450
transportation_desert_4catExcellent 66.0700782 0.0041175 63.6562602 67.0730274
school_count -0.2363096 0.0001729 -0.2653370 -0.2176155
store_count 0.8335427 0.0000397 0.8278885 0.8445275
bus_count 0.5955520 0.0000333 0.5910621 0.6418625
eviction_count -0.3312319 0.0000077 -0.3337286 -0.3304165
uninsured_count -0.0063382 0.0000007 -0.0065424 -0.0062500

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
 white_count ~ mean_income + mean_rent + 
    unemployment_count + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1845.8821423 0.5406396 878.8645051 3555.6655106
mean_income 0.0014266 0.0000057 0.0006792 0.0021613
unemployment_count 0.0432225 0.0001767 0.0185096 0.0689402
transportation_desert_4catLimited 108.0384501 0.2230049 49.2121177 175.9991654
transportation_desert_4catSatisfactory 84.1263761 0.2773221 35.0198573 169.6584737
transportation_desert_4catExcellent 105.7654825 0.3111778 37.2125286 190.9217058
store_count 0.7741202 0.0026634 0.3986937 1.1628353
bus_count 0.5612179 0.0035110 0.1446237 1.0169595
eviction_count -0.2979056 0.0004969 -0.3636314 -0.2279982

Models 4 & 5: Spatial

draft <- nyc_clean %>% drop_na(mean_rent) %>% drop_na(mean_income) %>% st_as_sf()
col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$white_count, listw = col_listw, nsim = 999, alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  col_sp$white_count 
## weights: col_listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.45512, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Poisson

col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.assault <- S.CARleroux(
  white_count ~ 1 + gini  + 
    mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count, 
  data = draft, 
  family = "poisson",
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.assault$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) 7.1376 7.0747 7.1878 1000 43.5 3.8 -0.6
gini 4.3551 4.2764 4.4929 1000 43.5 5.1 -0.7
mean_income 0.0000 0.0000 0.0000 1000 43.5 3.9 -3.2
mean_rent -0.0006 -0.0006 -0.0005 1000 43.5 2.3 3.2
unemployment_count 0.0003 0.0002 0.0003 1000 43.5 2.4 -0.3
transportation_desert_4catLimited 0.7162 0.7022 0.7219 1000 43.5 3.6 3.2
transportation_desert_4catSatisfactory 0.1018 0.0914 0.1057 1000 43.5 4.6 -7.1
transportation_desert_4catExcellent 0.3500 0.3455 0.3591 1000 43.5 2.6 4.8
school_count 0.0088 0.0074 0.0100 1000 43.5 3.1 -3.9
store_count 0.0087 0.0083 0.0090 1000 43.5 1.9 -5.7
bus_count -0.0013 -0.0015 -0.0012 1000 43.5 7.2 4.4
eviction_count -0.0035 -0.0035 -0.0035 1000 43.5 8.8 -0.1
uninsured_count -0.0001 -0.0001 -0.0001 1000 43.5 2.7 9.4
tau2 3.0440 2.2179 4.2320 1000 100.0 126.6 -1.2
rho 0.5190 0.2848 0.7856 1000 42.3 92.2 -0.9
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

draft %>%
  mutate(resid = model.assault$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle(paste0("1st and Degree Felony Assault: \nPoisson Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Zero-Inflated Poisson (Negative Binomial)

col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.assault <- S.CARleroux(
  white_count ~ 1 + gini  + 
    mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count, 
  data=draft,
  family = "zip",
  formula.omega = ~1,
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.assault$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) 7.3812 7.3216 7.4279 1000 47.0 6.2 1.8
gini 2.2399 2.1086 2.3265 1000 47.0 5.4 -2.7
mean_income 0.0000 0.0000 0.0000 1000 47.0 2.7 2.8
mean_rent -0.0001 -0.0001 -0.0001 1000 47.0 3.9 0.9
unemployment_count 0.0004 0.0004 0.0004 1000 47.0 2.9 -3.6
transportation_desert_4catLimited 0.0517 0.0460 0.0555 1000 47.0 9.4 -4.1
transportation_desert_4catSatisfactory 0.1412 0.1327 0.1477 1000 47.0 3.6 -2.8
transportation_desert_4catExcellent 0.3954 0.3840 0.4018 1000 47.0 2.5 1.5
school_count -0.0146 -0.0154 -0.0136 1000 47.0 8.2 4.3
store_count 0.0051 0.0049 0.0052 1000 47.0 3.6 4.8
bus_count 0.0023 0.0020 0.0026 1000 47.0 2.2 5.7
eviction_count -0.0026 -0.0027 -0.0026 1000 47.0 1.5 -9.6
uninsured_count 0.0000 0.0000 0.0000 1000 47.0 2.4 -8.7
omega - (Intercept) -41363.9008 -41363.9008 -41363.9008 1000 0.0 0.0 NaN
tau2 2.7412 2.1448 3.5727 1000 100.0 306.7 -1.9
rho 0.7027 0.4675 0.8935 1000 43.1 192.8 -1.9
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

draft %>%
  mutate(resid = model.assault$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle(paste0("1st and Degree Felony Assault: \nZIP Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Comparison

table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Negative Binomial") 

rbind(table1, table2, table3, table4) %>%
  dplyr::mutate(model = Model, .before=1) %>%
  dplyr::select(-Model)%>% kable() %>% kable_styling()
model mae mae_scaled within_50 within_95
Poisson 3835.725 43.613943 0.0000000 0.0083333
Negative Binomial 4416.318 0.415421 0.5916667 0.9833333
Hierarchichal Poisson 3641.245 3.293774 0.0166667 0.3666667
Hierarchichal Negative Binomial 4072.892 0.411570 0.6250000 0.9833333

Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1903.3130070 0.4882711 1053.9749617 3690.9242147
mean_income 0.0014141 0.0000069 0.0007642 0.0022762
unemployment_count 0.0439401 0.0001686 0.0238975 0.0644908
transportation_desert_4catLimited 90.2330125 0.2297732 36.6221884 160.7311834
transportation_desert_4catSatisfactory 91.4182213 0.3000295 31.9686484 178.8491978
transportation_desert_4catExcellent 123.4284965 0.2783874 56.9296869 211.6754022
store_count 0.8923541 0.0032509 0.4765713 1.3486077
bus_count 0.7071825 0.0033973 0.1927714 1.1140631
eviction_count -0.3279124 0.0004594 -0.3830488 -0.2664993

After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s white population count:

  • Mean income by neighborhood
  • Unemployment counts by neighborhood
  • Transportation desert status by neighborhood
  • Food Vendor counts by neighborhood
  • Bus station counts by neighborhood
  • Eviction counts by neighborhood.

Next, we interpret each predictor:

  • Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
  • Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.

If we intended to essentialize this list, transportation_desert_4, eviction_count, store_count seem to be the most informative predictors.

Transportation Models

Naive Bayes

# Load the package
library(e1071)

# Run the algorithm using only data on bill length
naive_model_1 <- naiveBayes(transportation_desert_4cat ~ mean_income + mean_rent + unemployment_count + school_count + store_count + bus_count + eviction_count + uninsured_count + white_count+ black_count+ latinx_count + asian_count, data = nyc_clean)

Classifications

Racial Counts

grid_data <- expand_grid(
    black_count = seq(min((nyc_clean %>% drop_na(black_count))$black_count), max((nyc_clean %>% drop_na(black_count))$black_count), length = 100),
    white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions
plot_1 <- ggplot(grid_data, aes(x = black_count, y = white_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Black and White Population") +
  labs(x="Black Count", y="White Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
grid_data <- expand_grid(
    latinx_count = seq(min((nyc_clean %>% drop_na(latinx_count))$latinx_count), max((nyc_clean %>% drop_na(latinx_count))$latinx_count), length = 100),
    white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions
plot_2 <- ggplot(grid_data, aes(x = latinx_count, y = white_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Latinx and White Population") +
  labs(x="Latinx Count", y="White Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

grid_data <- expand_grid(
    asian_count = seq(min((nyc_clean %>% drop_na(asian_count))$asian_count), max((nyc_clean %>% drop_na(asian_count))$asian_count), length = 100),
    white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions
plot_3 <- ggplot(grid_data, aes(x = asian_count, y = white_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Asian and White Population") +
  labs(x="Asian Count", y="White Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
ggpubr::ggarrange(plot_1, plot_2, plot_3, ncol=3,nrow = 1, common.legend=TRUE, legend = "bottom")

Mean Income by Racial Counts

grid_data <- expand_grid(
    mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
    white_count = seq(min((nyc_clean %>% drop_na(white_count))$white_count), max((nyc_clean %>% drop_na(white_count))$white_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions
white <- ggplot(grid_data, aes(x = mean_income, y = white_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("White Population") +
  labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

grid_data <- expand_grid(
    mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
    black_count = seq(min((nyc_clean %>% drop_na(black_count))$black_count), max((nyc_clean %>% drop_na(black_count))$black_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

black <-ggplot(grid_data, aes(x = mean_income, y = black_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  ggtitle("Black Population")+ 
  labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

grid_data <- expand_grid(
    mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
    latinx_count = seq(min((nyc_clean %>% drop_na(latinx_count))$latinx_count), max((nyc_clean %>% drop_na(latinx_count))$latinx_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

latinx<-ggplot(grid_data, aes(x = mean_income, y = latinx_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Latinx Population")  +
  labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

grid_data <- expand_grid(
    mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
    asian_count = seq(min((nyc_clean %>% drop_na(asian_count))$asian_count), max((nyc_clean %>% drop_na(asian_count))$asian_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

asian <- ggplot(grid_data, aes(x = mean_income, y = asian_count, color = classification)) +
  geom_point(alpha = 1, size=1.5) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  ggtitle("Asian Population")+ 
  labs(x="Mean Neighborhood Income", y="Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
ggpubr::ggarrange(white, black, latinx, asian, ncol=2,nrow = 2, common.legend=TRUE, legend = "bottom", label.x = )

Mean Income by Mean Rent

grid_data <- expand_grid(
    mean_income = seq(min((nyc_clean %>% drop_na(mean_income))$mean_income), max((nyc_clean %>% drop_na(mean_income))$mean_income), length = 100),
    mean_rent = seq(min((nyc_clean %>% drop_na(mean_rent))$mean_rent), max((nyc_clean %>% drop_na(mean_rent))$mean_rent), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions

ggplot(grid_data, aes(x = mean_income, y = mean_rent, color = classification)) +
  geom_point(alpha = 1, size=1) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Mean Rental Price") +
  labs(x="Mean Neighborhood Income", y="Mean Neighborhood Rental Price", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

takeaways:

  1. high income + high white count ~ Excellent Access
  2. low-mid white count + low mean income ~ satisfactory access
  3. low-mid white + low-mid mean income ~ Limited access
  4. low white count + low-mid mean income ~ no access







Unemployment by Eviction Counts

grid_data <- expand_grid(
    unemployment_count = seq(min((nyc_clean %>% drop_na(unemployment_count))$unemployment_count), max((nyc_clean %>% drop_na(unemployment_count))$unemployment_count), length = 100),
    eviction_count = seq(min((nyc_clean %>% drop_na(eviction_count))$eviction_count), max((nyc_clean %>% drop_na(eviction_count))$eviction_count), length = 100)) %>%
  mutate(classification = predict(naive_model_1, newdata = .))

# Plot the classification regions
ggplot(grid_data, aes(x = unemployment_count, y = eviction_count, color = classification)) +
  geom_point(alpha = 1, size=1) +
  scale_color_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6")  +
  theme_minimal() +
  ggtitle("Eviction by Unemployment Counts") +
  labs(x="Unemployment Count", y="Eviction Count", color="Subway Accessibility")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, hjust=.5, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

takeaways:

  1. high income + high white count ~ Excellent Access
  2. low-mid white count + low mean income ~ satisfactory access
  3. low-mid white + low-mid mean income ~ Limited access
  4. low white count + low-mid mean income ~ no access







Eviction Models

Models 1 & 2: Non-Hierarchical

We use poisson and negative binomial regression to model observed eviction counts, \(i\), using our \(k=10\) predictors:

  • gini
  • mean_income
  • white_count
  • black_count
  • latinx_count
  • asian_count
  • mean_rent
  • unemployment_count
  • sub_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]

poisson_non_hierarchical <- stan_glm(
  eviction_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) + 
  xlab("Eviction Count") +
  xlim(0,max(nyc_clean$eviction_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>% 
  kable_styling()
Non-Hierarchical Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 55416.1150639 1.1562444 83.6981133 1.189474e+05
mean_rent -0.1833582 0.0003201 -0.3955940 -3.026100e-03
unemployment_count 0.0646197 0.0007333 0.0137323 1.156050e-01
white_count -0.0122045 0.0001759 -0.0245155 -3.029000e-04
asian_count -0.0063640 0.0000729 -0.0114315 -4.509000e-04

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]

negbin_non_hierarchical <- stan_glm(
 eviction_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) + 
  xlab("Eviction Count") +
  xlim(0,max(nyc_clean$eviction_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_non_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 30.5453872 0.5287620 15.0869470 59.1729909
gini 2418.1203641 1.2661393 475.1531497 10412.0972367
unemployment_count 0.0200076 0.0000906 0.0088465 0.0317101
black_count 0.0035390 0.0000074 0.0025635 0.0043668
latinx_count 0.0029908 0.0000078 0.0018979 0.0041004
transportation_desert_4catLimited 28.8465194 0.1195558 9.1149525 50.1353776
transportation_desert_4catSatisfactory 17.4168094 0.1448793 0.4915411 41.4977460
transportation_desert_4catExcellent 32.1688719 0.1484052 10.7543255 59.1151327
school_count -1.7919412 0.0090439 -2.9495273 -0.5244692
store_count 0.2906715 0.0021231 0.0647539 0.5353725
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Negative Binomial")

Negative binomial has much better error metrics.

Models 2 & 3: Hierarchy by Borough

We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with transportation_desert_4cat:

  • mean_income
  • mean_rent
  • unemployment_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
  eviction_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    uninsured_count + (1 | borough),
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) + 
  xlab("Eviction Count") +
  xlim(0,max(nyc_clean$eviction_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 809.6633508 4.5449121 21.8424000 63684.3915131
mean_income -0.0005827 0.0000016 -0.0010228 -0.0002919
mean_rent -0.1117542 0.0015536 -0.2276449 -0.0028474
unemployment_count 0.0471095 0.0005280 0.0117567 0.1071384
asian_count -0.0066243 0.0000934 -0.0130425 -0.0002948
transportation_desert_4catLimited 27.8311051 0.0345974 21.3364953 31.8488463

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
eviction_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    uninsured_count+ (1 | borough),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) + 
  xlab("Eviction Count") +
  xlim(0,max(nyc_clean$eviction_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 53.0835129 0.7801604 20.5159984 139.4805573
gini 700.8616334 1.5425202 27.5405436 4094.9143873
mean_rent -0.0353251 0.0002534 -0.0621987 -0.0022165
white_count 0.0005786 0.0000039 0.0000139 0.0011426
black_count 0.0037818 0.0000060 0.0031138 0.0046892
latinx_count 0.0015668 0.0000084 0.0006714 0.0026564
transportation_desert_4catLimited 29.4117040 0.1157931 12.1793852 48.5268663
transportation_desert_4catSatisfactory 22.8799647 0.1265743 2.4483039 42.1049581
transportation_desert_4catExcellent 41.8088079 0.1470004 18.2539168 70.2206680
school_count -1.3885206 0.0088653 -2.5049046 -0.2472096
store_count 0.4476516 0.0019607 0.2200616 0.7065064

Models 4 & 5: Spatial

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$eviction_count, listw = col_listw, nsim = 999, alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  col_sp$eviction_count 
## weights: col_listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.47185, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Poisson

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.eviction <- S.CARleroux(
  eviction_count ~ 1 + gini  + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count +  uninsured_count, 
  data = nyc_clean, 
  family = "poisson",
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.eviction$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) 0.9732 0.4840 1.2552 1000 44.0 2.6 3.8
gini 2.8176 2.2955 3.7680 1000 44.0 2.7 -3.2
unemployment_count 0.0007 0.0007 0.0008 1000 44.0 2.3 7.1
white_count 0.0000 0.0000 0.0000 1000 44.0 4.7 -1.3
black_count 0.0000 0.0000 0.0000 1000 44.0 4.0 2.0
latinx_count 0.0000 0.0000 0.0001 1000 44.0 7.8 1.7
asian_count 0.0000 0.0000 0.0000 1000 44.0 3.4 -4.7
transportation_desert_4catLimited 0.0251 -0.0213 0.0847 1000 44.0 3.9 -2.4
transportation_desert_4catSatisfactory 0.3465 0.3192 0.3670 1000 44.0 8.3 1.4
transportation_desert_4catExcellent 0.1966 0.1065 0.2292 1000 44.0 3.4 1.1
school_count 0.0074 0.0009 0.0174 1000 44.0 2.6 -5.4
store_count 0.0087 0.0078 0.0093 1000 44.0 5.6 0.1
uninsured_count -0.0001 -0.0001 -0.0001 1000 44.0 1.7 -1.9
tau2 2.7575 1.9175 3.8906 1000 100.0 64.7 -1.2
rho 0.3607 0.1643 0.5784 1000 49.5 57.1 -0.7
p_plot <- round((moran.mc(x = as.vector(model.eviction$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

nyc_clean %>%
  mutate(resid = model.eviction$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle(paste0("Eviction: \nPoisson Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Zero-Inflated Poisson (Negative Binomial)

draft <- nyc_clean

col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained

model.eviction <- S.CARleroux(
  eviction_count ~ 1 + gini  + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count +  uninsured_count, 
  data = nyc_clean, 
  family = "zip",
  formula.omega = ~1,
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.eviction$summary.results) %>% 
  rownames_to_column("term") %>%
  filter(Median != 0 ) %>%
  filter(`2.5%` > 0 & `97.5%` > 0 | `2.5%`  < 0 & `97.5%` < 0) %>%  
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) 3.0805 2.8014 3.4537 1000 46.0 2.9 5.0
unemployment_count 0.0004 0.0004 0.0005 1000 46.0 1.4 -1.8
asian_count 0.0001 0.0001 0.0001 1000 46.0 9.8 2.3
transportation_desert_4catLimited -0.2499 -0.3303 -0.1594 1000 46.0 1.2 -5.9
transportation_desert_4catSatisfactory -0.4019 -0.4304 -0.3651 1000 46.0 7.6 4.9
transportation_desert_4catExcellent -0.2977 -0.3433 -0.2393 1000 46.0 3.2 -0.5
school_count -0.0212 -0.0244 -0.0149 1000 46.0 8.9 -2.2
store_count 0.0108 0.0098 0.0122 1000 46.0 2.9 -4.2
omega - (Intercept) 46600.1794 46600.1794 46600.1794 1000 0.0 0.0 NaN
tau2 4.1346 3.0057 5.6135 1000 100.0 112.3 -1.8
rho 0.6653 0.3936 0.9241 1000 49.9 86.6 -0.4
# p_plot <- round((moran.mc(x = as.vector(model.eviction$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)


 nyc_clean %>%
   mutate(resid = model.eviction$residuals$response) %>%
   ggplot(aes(fill = resid), color = "#8f98aa") +
   geom_sf()+
   scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                          guide_legend(title = "Residual")) +
   theme_minimal() +
   theme(panel.grid.major = element_line("transparent"),
         axis.text = element_blank()) +
   ggtitle(paste0("Eviction: \nZIP Model Residuals ( p_plot)"))+
     theme(panel.grid.major = element_line("transparent"),
           plot.title = element_text(size = 25, face = "bold"),
           legend.title = element_text(size = 12),
           legend.text = element_text(size = 12)) +
     guides(shape = guide_legend(override.aes = list(size = 8)),
            color = guide_legend(override.aes = list(size = 8)))

Comparison

table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Negative Binomial") 

rbind(table1, table2, table3, table4) %>%
  dplyr::mutate(model = Model, .before=1) %>%
  dplyr::select(-Model)%>% kable() %>% kable_styling()
model mae mae_scaled within_50 within_95
Poisson 124.1150 1.1197121 0.5083333 0.6416667
Negative Binomial 58.9475 0.4954948 0.6166667 0.9666667
Hierarchichal Poisson 107.6275 0.9523888 0.5333333 0.6666667
Hierarchichal Negative Binomial 55.4150 0.5521519 0.5833333 0.9833333

Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 30.5453872 0.5287620 15.0869470 59.1729909
gini 2418.1203641 1.2661393 475.1531497 10412.0972367
unemployment_count 0.0200076 0.0000906 0.0088465 0.0317101
black_count 0.0035390 0.0000074 0.0025635 0.0043668
latinx_count 0.0029908 0.0000078 0.0018979 0.0041004
transportation_desert_4catLimited 28.8465194 0.1195558 9.1149525 50.1353776
transportation_desert_4catSatisfactory 17.4168094 0.1448793 0.4915411 41.4977460
transportation_desert_4catExcellent 32.1688719 0.1484052 10.7543255 59.1151327
school_count -1.7919412 0.0090439 -2.9495273 -0.5244692
store_count 0.2906715 0.0021231 0.0647539 0.5353725

After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s felony eviction counts:

  • Mean income by neighborhood
  • Unemployment counts by neighborhood
  • Transportation desert status by neighborhood
  • Food Vendor counts by neighborhood
  • Bus station counts by neighborhood
  • Eviction counts by neighborhood.

Next, we interpret each predictor:

  • Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
  • Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.

If we intended to essentialize this list, transportation_desert_4, eviction_count, store_count seem to be the most informative predictors.

Assault Models

Models 1 & 2: Non-Hierarchical

We use poisson and negative binomial regression to model observed assault counts, \(i\), using our \(k=10\) predictors:

  • gini
  • mean_income
  • white_count
  • black_count
  • latinx_count
  • asian_count
  • mean_rent
  • unemployment_count
  • sub_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]

poisson_non_hierarchical <- stan_glm(
  assault_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) + 
  xlab("Felon Assault Count") +
  xlim(0,max(nyc_clean$assault_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>% 
  kable_styling()
Non-Hierarchical Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 3.5864689 0.1983254 2.7327398 4.6456268
gini 5396.0460106 0.4182073 3285.4646368 9804.7747080
mean_income 0.0002932 0.0000012 0.0001445 0.0004605
mean_rent -0.0448781 0.0000858 -0.0565014 -0.0344047
white_count -0.0014579 0.0000014 -0.0016540 -0.0012487
latinx_count 0.0007796 0.0000025 0.0003967 0.0010653
transportation_desert_4catLimited 60.2880345 0.0531195 50.1488375 69.0748289
transportation_desert_4catSatisfactory 29.7228730 0.0601048 20.0323022 39.9971227
transportation_desert_4catExcellent 73.5251029 0.0594743 60.9106951 85.1933438
store_count 0.5518890 0.0005727 0.4844971 0.6122244
bus_count 0.5740996 0.0004254 0.5258629 0.6227106
eviction_count 0.0421430 0.0000887 0.0312818 0.0558097

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]

negbin_non_hierarchical <- stan_glm(
 assault_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) + 
  xlab("Felony Assualt Count") +
  xlim(0,max(nyc_clean$assault_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_non_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 2.1998363 0.8420645 0.6470274 7.362050e+00
gini 9155.7878631 1.9906985 541.5958625 1.066761e+05
mean_income 0.0006529 0.0000046 0.0001557 1.293700e-03
mean_rent -0.0700853 0.0003057 -0.1100915 -3.038670e-02
white_count -0.0016060 0.0000072 -0.0026489 -7.126000e-04
transportation_desert_4catLimited 70.4209661 0.1998572 33.6839614 1.305728e+02
transportation_desert_4catExcellent 87.7150816 0.2308285 29.8265673 1.650792e+02
school_count 1.7349760 0.0131659 0.1842995 3.408764e+00
store_count 0.7203276 0.0033028 0.3282823 1.123578e+00
bus_count 0.7038286 0.0023984 0.3565005 9.673685e-01
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Negative Binomial")

Negative binomial has much better error metrics.

Models 2 & 3: Hierarchy by Borough

We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with transportation_desert_4cat:

  • mean_income
  • mean_rent
  • unemployment_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
  assault_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) + 
  xlab("Felony Assault Count") +
  xlim(0,max(nyc_clean$assault_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 2.604147e+05 13.9154929 6.2855040 2.641469e+09
unemployment_count -1.019620e-02 0.0000182 -0.0112094 -4.090700e-03
white_count -1.161110e-02 0.0001479 -0.0215374 -1.258900e-03
school_count 5.656243e+00 0.0685694 0.4419139 1.066438e+01
bus_count 3.083773e+00 0.0363440 0.5476908 5.656987e+00
eviction_count 2.645923e-01 0.0029338 0.0488013 4.619764e-01
uninsured_count 5.206250e-02 0.0007241 0.0008345 1.009630e-01

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
assault_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count+ (1 | borough),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) + 
  xlab("Felony Assault Count") +
  xlim(0,max(nyc_clean$assault_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 5.3270832 1.0967411 1.6974017 23.5092739
mean_rent -0.0739410 0.0003625 -0.1154820 -0.0266327
white_count -0.0014585 0.0000071 -0.0024297 -0.0006557
transportation_desert_4catLimited 80.6050068 0.1977024 38.4474675 132.9241017
transportation_desert_4catExcellent 101.3320462 0.2694313 41.4451363 179.5988054
school_count 2.3608552 0.0140133 0.6749783 4.1613701
store_count 0.8449207 0.0030550 0.4437088 1.2270165
bus_count 0.6870714 0.0028121 0.3789878 1.0114716

Models 4 & 5: Spatial

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$assault_count, listw = col_listw, nsim = 999, alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  col_sp$assault_count 
## weights: col_listw  
## number of simulations + 1: 1000 
## 
## statistic = 0.19199, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater

Poisson

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.assault <- S.CARleroux(
  assault_count ~ 1 + gini  + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count +  eviction_count + uninsured_count, 
  data = nyc_clean, 
  family = "poisson",
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.assault$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) -1.9999 -2.6628 -1.3056 1000 44.6 4.8 0.8
gini 7.2529 5.6065 8.4223 1000 44.6 4.6 -0.4
unemployment_count 0.0000 -0.0001 0.0001 1000 44.6 6.2 -4.8
white_count 0.0000 0.0000 0.0000 1000 44.6 2.5 -0.7
black_count 0.0000 0.0000 0.0000 1000 44.6 3.8 0.8
latinx_count 0.0000 0.0000 0.0001 1000 44.6 1.6 6.9
asian_count 0.0000 0.0000 0.0000 1000 44.6 10.5 -2.0
transportation_desert_4catLimited 0.4898 0.3948 0.6482 1000 44.6 1.8 6.0
transportation_desert_4catSatisfactory 0.1118 -0.0032 0.2352 1000 44.6 3.2 -3.7
transportation_desert_4catExcellent 0.3385 0.2085 0.4497 1000 44.6 2.1 -9.4
school_count 0.0090 -0.0045 0.0174 1000 44.6 3.9 -3.2
store_count 0.0047 0.0030 0.0070 1000 44.6 2.7 -1.9
eviction_count 0.0004 0.0000 0.0005 1000 44.6 4.1 2.3
uninsured_count 0.0000 0.0000 0.0000 1000 44.6 10.1 -2.1
tau2 0.8795 0.6206 1.3123 1000 100.0 57.0 1.5
rho 0.0651 0.0040 0.1934 1000 44.9 57.7 1.2
p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

nyc_clean %>%
  mutate(resid = model.assault$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle(paste0("1st and Degree Felony Assault: \nPoisson Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Zero-Inflated Poisson (Negative Binomial)

draft <- nyc_clean

col_sp <- as(draft, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained

model.assault <- S.CARleroux(
  assault_count ~ 1 + gini  + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count +  eviction_count + uninsured_count, 
  data = nyc_clean, 
  family = "zip",
  formula.omega = ~1,
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.assault$summary.results) %>% 
  rownames_to_column("term") %>%
  filter(Median != 0 ) %>%
  filter(`2.5%` > 0 & `97.5%` > 0 | `2.5%`  < 0 & `97.5%` < 0) %>%  
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) -1.9043 -3.1216 -1.1967 1000 47.0 2.9 0.7
gini 7.0408 5.0297 9.3690 1000 47.0 2.8 -0.4
latinx_count 0.0001 0.0001 0.0001 1000 47.0 10.8 2.6
transportation_desert_4catLimited 0.4587 0.3100 0.5598 1000 47.0 2.1 -9.1
transportation_desert_4catSatisfactory 0.1377 0.0744 0.1956 1000 47.0 7.5 2.3
transportation_desert_4catExcellent 0.2915 0.1986 0.3877 1000 47.0 3.3 4.8
school_count -0.0153 -0.0267 -0.0041 1000 47.0 3.2 -5.0
store_count 0.0131 0.0110 0.0150 1000 47.0 4.6 2.7
eviction_count 0.0006 0.0003 0.0008 1000 47.0 3.9 1.0
uninsured_count -0.0002 -0.0002 -0.0002 1000 47.0 3.1 -10.5
omega - (Intercept) -11809.4038 -11809.4038 -11809.4038 1000 0.0 0.0 NaN
tau2 0.8459 0.6464 1.1860 1000 100.0 119.5 0.2
rho 0.0452 0.0029 0.1449 1000 47.9 95.9 0.2
# p_plot <- round((moran.mc(x = as.vector(model.assault$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

# nyc_clean %>%
#   mutate(resid = model.assault$residuals$response) %>%
#   ggplot(aes(fill = resid), color = "#8f98aa") +
#   geom_sf()+
#   scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
#                          guide_legend(title = "Residual")) +
#   theme_minimal() +
#   theme(panel.grid.major = element_line("transparent"),
#         axis.text = element_blank()) +
#   ggtitle(paste0("1st and Degree Felony Assault: \nZIP Model Residuals (", p_plot, ")"))+ 
#     theme(panel.grid.major = element_line("transparent"),
#           plot.title = element_text(size = 25, face = "bold"),
#           legend.title = element_text(size = 12), 
#           legend.text = element_text(size = 12)) + 
#     guides(shape = guide_legend(override.aes = list(size = 8)),
#            color = guide_legend(override.aes = list(size = 8)))

Comparison

table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Negative Binomial") 

rbind(table1, table2, table3, table4) %>%
  dplyr::mutate(model = Model, .before=1) %>%
  dplyr::select(-Model)%>% kable() %>% kable_styling()
model mae mae_scaled within_50 within_95
Poisson 19.7075 2.7633695 0.1416667 0.3333333
Negative Binomial 20.6150 0.6005202 0.5583333 0.9833333
Hierarchichal Poisson 29.5525 0.9424668 0.5500000 0.6500000
Hierarchichal Negative Binomial 20.4425 0.6541777 0.5000000 0.9666667

Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 2.1998363 0.8420645 0.6470274 7.362050e+00
gini 9155.7878631 1.9906985 541.5958625 1.066761e+05
mean_income 0.0006529 0.0000046 0.0001557 1.293700e-03
mean_rent -0.0700853 0.0003057 -0.1100915 -3.038670e-02
white_count -0.0016060 0.0000072 -0.0026489 -7.126000e-04
transportation_desert_4catLimited 70.4209661 0.1998572 33.6839614 1.305728e+02
transportation_desert_4catExcellent 87.7150816 0.2308285 29.8265673 1.650792e+02
school_count 1.7349760 0.0131659 0.1842995 3.408764e+00
store_count 0.7203276 0.0033028 0.3282823 1.123578e+00
bus_count 0.7038286 0.0023984 0.3565005 9.673685e-01

After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s felony assault counts:

  • Mean income by neighborhood
  • Unemployment counts by neighborhood
  • Transportation desert status by neighborhood
  • Food Vendor counts by neighborhood
  • Bus station counts by neighborhood
  • Eviction counts by neighborhood.

Next, we interpret each predictor:

  • Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
  • Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.

If we intended to essentialize this list, transportation_desert_4, eviction_count, store_count seem to be the most informative predictors.

Sex-Violence Models

Models 1 & 2: Non-Hierarchical

We use poisson and negative binomial regression to model observed assault counts, \(i\), using our \(k=10\) predictors:

  • gini
  • mean_income
  • white_count
  • black_count
  • latinx_count
  • asian_count
  • mean_rent
  • unemployment_count
  • sub_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]

poisson_non_hierarchical <- stan_glm(
  sex_crime_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) + 
  xlab("Sex-Based Crimes Count") +
  xlim(0,max(nyc_clean$sex_crime_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>% 
  kable_styling()
Non-Hierarchical Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 0.4120347 0.4006631 0.2273258 0.6758560
gini 3587.9061283 0.8780549 1198.0715235 11249.5836798
mean_income -0.0006794 0.0000028 -0.0009775 -0.0001463
unemployment_count -0.0354126 0.0000596 -0.0422560 -0.0281694
white_count 0.0013926 0.0000022 0.0010219 0.0017033
black_count 0.0019501 0.0000041 0.0013431 0.0024266
latinx_count 0.0039675 0.0000048 0.0031338 0.0046113
asian_count 0.0054798 0.0000048 0.0045999 0.0060160
transportation_desert_4catLimited 173.1518336 0.1147418 140.5511570 219.2276179
transportation_desert_4catSatisfactory 84.1372776 0.1358524 54.0562285 117.7787728
transportation_desert_4catExcellent 201.2982840 0.1286909 161.8717676 258.7494905
school_count 0.6618839 0.0046520 0.0790657 1.4987590
store_count 0.1256186 0.0010681 0.0109349 0.2731614
bus_count 0.8590682 0.0007970 0.7546579 0.9676299
eviction_count 0.0772791 0.0001721 0.0571873 0.1019295
uninsured_count -0.0117684 0.0000217 -0.0142757 -0.0081621

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]

negbin_non_hierarchical <- stan_glm(
 sex_crime_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) + 
  xlab("Sex-Based Crime Count") +
  xlim(0,max(nyc_clean$sex_crime_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_non_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 0.1412859 1.6773478 0.0205149 1.487618e+00
gini 83112.5360342 3.3742642 699.4828738 5.658686e+06
mean_rent -0.0690438 0.0005387 -0.1351984 -4.143100e-03
latinx_count 0.0047039 0.0000287 0.0007911 7.644200e-03
asian_count 0.0068104 0.0000284 0.0029518 9.905500e-03
transportation_desert_4catLimited 227.3774554 0.4375992 88.3039435 4.952490e+02
transportation_desert_4catExcellent 238.0095508 0.4822838 82.4135280 5.960894e+02
store_count 0.8100002 0.0069962 0.0412584 1.773830e+00
uninsured_count -0.0171355 0.0001329 -0.0318871 -1.290800e-03
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Negative Binomial")

Negative binomial has much better error metrics.

Models 2 & 3: Hierarchy by Borough

We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with transportation_desert_4cat:

  • mean_income
  • mean_rent
  • unemployment_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
  sex_crime_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = poisson,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) + 
  xlab("Sex-Based Crime Count") +
  xlim(0,max(nyc_clean$sex_crime_count))+
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 9088.3043289 18.5470740 0.0134379 2.953498e+09
unemployment_count -0.0126650 0.0000227 -0.0331440 -1.122810e-02
asian_count 0.0067423 0.0000331 0.0038719 1.001290e-02
bus_count 2.9602367 0.0349820 0.4975013 5.678168e+00

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
sex_crime_count ~ gini + mean_income + mean_rent + 
    unemployment_count + white_count + black_count + latinx_count + asian_count+
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count+ (1 | borough),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 2, iter = 100*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) + 
  xlab("Sex-Based Crime Count") +
  xlim(0,max(nyc_clean$sex_crime_count))+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.8) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank())

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 5.871170e-02 2.2911940 0.0035598 1.023883e+00
gini 4.268212e+05 4.3865093 1551.1395970 4.938351e+07
asian_count 5.624600e-03 0.0000349 0.0017718 9.856000e-03
transportation_desert_4catLimited 2.305360e+02 0.4389344 97.0243687 4.840791e+02
transportation_desert_4catSatisfactory 7.448618e+01 0.4333771 9.7214513 2.025286e+02
transportation_desert_4catExcellent 2.902775e+02 0.4976055 120.8134610 5.857729e+02
store_count 1.091114e+00 0.0081662 0.0735059 2.042746e+00
bus_count 6.415640e-01 0.0052005 0.0561314 1.435678e+00

Models 4 & 5: Spatial

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure
moran.mc(col_sp$sex_crime_count, listw = col_listw, nsim = 999, alternative = "greater")
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  col_sp$sex_crime_count 
## weights: col_listw  
## number of simulations + 1: 1000 
## 
## statistic = -0.04102, observed rank = 172, p-value = 0.828
## alternative hypothesis: greater

Poisson

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.sex <- S.CARleroux(
  sex_crime_count ~ 1 + total_pop + gini + below_poverty_line_count, 
  data = nyc_clean, 
  family = "poisson",
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.sex$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) -2.5346 -3.5162 -1.4237 1000 44.9 8.3 -2.0
total_pop 0.0000 0.0000 0.0000 1000 44.9 1.3 1.2
gini 5.1898 1.8197 7.2130 1000 44.9 6.7 0.6
below_poverty_line_count 0.0000 0.0000 0.0001 1000 44.9 3.4 3.9
tau2 2.1539 1.7194 2.7787 1000 100.0 215.7 0.2
rho 0.0116 0.0006 0.0481 1000 46.5 101.4 0.6
p_plot <- round((moran.mc(x = as.vector(model.sex$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

nyc_clean %>%
  mutate(resid = model.sex$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle(paste0("Sex-Based Violence: \nPoisson Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Zero-Inflated Poisson (Negative Binomial)

col_sp <- as(nyc_clean, "Spatial")
col_nb <- poly2nb(col_sp) # queen neighborhood
col_listw <- nb2listw(col_nb, style = "B") # listw version of the neighborhood
W <- nb2mat(col_nb, style = "B") # binary structure

M.burnin <- 10000       # Number of burn-in iterations (discarded)
M <- 1000               # Number of iterations retained


model.sex <- S.CARleroux(
  sex_crime_count ~ 1 + total_pop + gini + below_poverty_line_count, 
  data = nyc_clean, 
  family = "zip",
  formula.omega = ~1,
  W = W,
  burnin = M.burnin,
  n.sample = M.burnin + M,    # Total iterations
  verbose = FALSE)

as.data.frame(model.sex$summary.results) %>% 
  rownames_to_column("term") %>%
  kable()%>% 
  kable_styling() %>% 
  scroll_box(width = "100%", height = "200px")
term Median 2.5% 97.5% n.sample % accept n.effective Geweke.diag
(Intercept) -2.0063 -4.1039 -0.7230 1000 40.1 5.3 -9.5
total_pop 0.0000 0.0000 0.0000 1000 40.1 3.1 -3.0
gini 4.8409 2.4735 9.9535 1000 40.1 5.1 11.7
below_poverty_line_count 0.0001 0.0000 0.0001 1000 40.1 2.5 0.4
omega - (Intercept) -12914.7514 -12914.7514 -12914.7514 1000 0.0 0.0 NaN
tau2 2.2393 1.7625 2.9347 1000 100.0 155.8 -0.9
rho 0.0076 0.0001 0.0473 1000 40.4 92.9 -1.1
p_plot <- round((moran.mc(x = as.vector(model.sex$residuals$response), listw = col_listw, nsim = 9999, alternative = "greater")$p.value),5)
  

nyc_clean %>%
  mutate(resid = model.sex$residuals$response) %>%
  ggplot(aes(fill = resid), color = "#8f98aa") +
  geom_sf()+
  scale_fill_gradientn(colors=c("#fef6ef","#f07f9b","#e94a7c","#d61760"), guide =
                         guide_legend(title = "Residual")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
ggtitle(paste0("Sex-Based Violence: \nZIP Model Residuals (", p_plot, ")"))+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 25, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Comparison

table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Negative Binomial") 

rbind(table1, table2, table3, table4) %>%
  dplyr::mutate(model = Model, .before=1) %>%
  dplyr::select(-Model)%>% kable() %>% kable_styling()
model mae mae_scaled within_50 within_95
Poisson 9.8175 2.4370585 0.1000000 0.4666667
Negative Binomial 13.1000 0.5442446 0.5666667 0.9833333
Hierarchichal Poisson 9.7000 0.9074845 0.4833333 0.6833333
Hierarchichal Negative Binomial 13.1975 0.5221313 0.5833333 1.0000000

Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 0.1412859 1.6773478 0.0205149 1.487618e+00
gini 83112.5360342 3.3742642 699.4828738 5.658686e+06
mean_rent -0.0690438 0.0005387 -0.1351984 -4.143100e-03
latinx_count 0.0047039 0.0000287 0.0007911 7.644200e-03
asian_count 0.0068104 0.0000284 0.0029518 9.905500e-03
transportation_desert_4catLimited 227.3774554 0.4375992 88.3039435 4.952490e+02
transportation_desert_4catExcellent 238.0095508 0.4822838 82.4135280 5.960894e+02
store_count 0.8100002 0.0069962 0.0412584 1.773830e+00
uninsured_count -0.0171355 0.0001329 -0.0318871 -1.290800e-03

After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s sex-based crime counts:

  • Mean income by neighborhood
  • Unemployment counts by neighborhood
  • Transportation desert status by neighborhood
  • Food Vendor counts by neighborhood
  • Bus station counts by neighborhood
  • Eviction counts by neighborhood.

Next, we interpret each predictor:

  • Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
  • Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.

If we intended to essentialize this list, transportation_desert_4, eviction_count, store_count seem to be the most informative predictors.

Next Steps

We intend to adjust our model specifications and reconsider the construction and inclusion of our predictors. For example, we may want to rethink how we’re making our transit-access variable, whether including car ownership or transit use, and what predictors we’re specifically using in our models.

Moving forward, we want to build some non-hierarchical models predicting subway and bus count using the demographic data (like median income, rent, percentage white population, etc) to see the flip-side of what we are looking at right now with our models above.

Our current hierarchical models attempt to capture a spatial interaction by borough, but we may extend this hierarchy to do census-tracts within neighborhoods, so as to unnderstand observed differences at a finer level. Further, we may include latitude and longitude variables into our non-grouped model to account for spatial relationships and avoid hierarchy altogether. However, we may use CARBayes to adjust for the spatial factors.







Participation

  • Sam Ding: Pulled and prepared subway ridership and access data, then harmonized transit data with existing neighborhood data.
  • Vichy Meas: Helped clean both transit and demographic data, scaffolded the checkpoint 3 responses, and helped plan out transit questions.
  • Freddy Barragan: Pulled and prepared grocery, bus stop, school, eviction, and census data, made exploratory visualizations by neighborhood, built/coded our proposed demographic models.
  • Juthi Dewan: Harmonized, cleaned, and joined all disparate datasets, prepared our data summaries, built/coded our proposed demographic models.